# Code for extracting the words that describe characters in a book.
# The same pipeline can then be run over every book in a larger corpus.
# This code doesn't scale particularly well, but it works for a corpus of 10-20k books.
# I intend to make this a package down the line, so others can import this code for their own purposes.

# Code has three overarching steps, followed by a final filtering step
# 1) Take in a text file, extract prose, chunk sentences for parsing. (TextGatherer Class)
# 2) Annotate each text for coreference and dependency data using spaCy libraries. (Annotator Class)
# 3) Connect coreference and dependency data, outputting a dataframe of words describing characters.
#    (CharacterArchiver Class)
# 4) Keep only the characters described by more words than a chosen threshold. (MajorCharacterFinder Class)

# main() function below will give you a sense of the inputs and outputs of each step.
# I recommend reading that first, before diving into the classes themselves.

# import libraries
import os
from nltk import word_tokenize, sent_tokenize
import spacy
import neuralcoref
import pandas as pd
import numpy as np

# Render floats in pandas output with thousands separators and no decimal places.
# NOTE(review): presumably needed because integer id columns (sids, tid_target, ...)
# are upcast to float by the left merges further down -- confirm.
pd.options.display.float_format = '{:,.0f}'.format


class TextGatherer:
    """Read a text file, drop quoted dialog and chunk the remaining prose.

    Attributes set by ``__init__``:
        path_to_file: the path that was passed in.
        file_id: base name of ``path_to_file``.
        text: full contents of the file (read as UTF-8).
        tokenized_text: ``word_tokenize`` output for ``text``.
        parent_directory: absolute path of the directory containing the file.
    """

    def __init__(self, path_to_file):
        """Load and tokenize the file at ``path_to_file``."""
        self.path_to_file = path_to_file
        self.file_id = os.path.basename(path_to_file)
        with open(path_to_file, 'rt', encoding="utf8") as file_in:
            self.text = file_in.read()
        self.tokenized_text = word_tokenize(self.text)
        self.parent_directory = os.path.abspath(os.path.join(self.path_to_file, os.pardir))

    def parse_prose(self, quote_open='“', quote_close='”'):
        """Split the token stream into runs of narrative, skipping dialog.

        A token containing ``quote_open`` or ``quote_close`` starts a stretch
        of dialog; the stretch ends at the next *later* token containing
        ``quote_close``. Everything outside such stretches is narrative.

        Args:
            quote_open: substring that marks an opening quotation mark.
            quote_close: substring that marks a closing quotation mark.

        Returns:
            A list of token lists, one per run of narrative. A run that
            precedes a quote is recorded even when it is empty (callers
            filter those out); an unterminated quote at the end of the text
            is treated as dialog and discarded.
        """
        parsed_prose = []
        current = []
        in_dialog = False

        for word in self.tokenized_text:
            if in_dialog:
                # dialog tokens are skipped; a closing quote ends the stretch
                if quote_close in word:
                    in_dialog = False
                continue

            if quote_open in word or quote_close in word:
                # a quote begins: bank the narrative collected so far
                parsed_prose.append(current)
                current = []
                in_dialog = True
            else:
                current.append(word)

        # Flush the narrative that follows the last quote (or the whole text
        # when it contains no quotes at all); without this it would be lost.
        if current:
            parsed_prose.append(current)

        return parsed_prose

    @staticmethod
    def get_prose_string(parsed_prose):
        """Join every token of every narrative run into one space-separated string."""
        return " ".join(str(token) for piece in parsed_prose if piece for token in piece)

    @staticmethod
    def get_sentences(prose_string):
        """Split ``prose_string`` into sentences with ``sent_tokenize``."""
        return sent_tokenize(prose_string)

    @staticmethod
    def chunk_sentences(sentences, chunk_size=2):
        """Group consecutive sentences into space-joined chunks.

        Args:
            sentences: list of sentence strings.
            chunk_size: number of sentences per chunk (the last chunk may be
                shorter).

        Returns:
            A list of strings, each holding up to ``chunk_size`` sentences.
        """
        return [" ".join(sentences[start:start + chunk_size])
                for start in range(0, len(sentences), chunk_size)]


class Annotator:
    """Run a parser over text chunks and tabulate its annotations.

    ``parser`` is called once per chunk; the static ``make_*`` methods turn
    the resulting list of parsed chunks into pandas DataFrames.
    """

    def __init__(self, str_chunks, parser):
        self.parser = parser
        self.str_chunks = str_chunks

    def spacy_init(self):
        """Return the parser output for every chunk, in chunk order."""
        return [self.parser(text) for text in self.str_chunks]

    @staticmethod
    def make_entities(doc):
        """Tabulate the distinct PERSON entities found in the parsed chunks.

        Returns a DataFrame with columns ``entity`` (lower-cased text) and
        ``entity_type``; duplicates are dropped before lower-casing.
        """
        rows = []
        for parsed_chunk in doc:
            for sentence in parsed_chunk.sents:
                for ent in sentence.ents:
                    rows.append((ent.text, ent.label_))

        all_entities = pd.DataFrame(rows, columns=['entity', 'entity_type'])
        is_person = all_entities['entity_type'].str.contains('PERSON')
        people = all_entities[is_person].drop_duplicates()
        return people.assign(entity=people['entity'].str.lower())

    @staticmethod
    def make_dependencies(doc):
        """Tabulate tokens whose dependency label is one of the tracked relations.

        Chunk ids (``cids``) start at 1, sentence ids (``sids``) start at 0
        within each chunk; ``tid``/``tid_target`` are the ``.i`` indices of the
        token and of its head.
        """
        tracked = ("nsubj", "poss", "dobj", "amod", "acomp", "attr", "conj")
        columns = ['cids', 'sids', 'tid', 'tid_target',
                   'relation', 'word', 'lemma',
                   'word_target', 'lemma_target']

        rows = []
        for chunk_id, parsed_chunk in enumerate(doc, 1):
            for sent_id, sentence in enumerate(parsed_chunk.sents):
                for tok in sentence:
                    if tok.dep_ not in tracked:
                        continue
                    governor = tok.head
                    rows.append((chunk_id, sent_id, tok.i, governor.i, tok.dep_,
                                 tok.text, tok.lemma_, governor.text, governor.lemma_))

        return pd.DataFrame(rows, columns=columns)

    @staticmethod
    def make_coreferences(doc):
        """Tabulate every mention of every coreference cluster.

        Each row holds the chunk id (from 1), the cluster's ``i``, the
        mention's ``start``, the mention text, and the string form of the
        main mention of the mention's cluster.
        """
        rows = []
        for chunk_id, parsed_chunk in enumerate(doc, 1):
            for cluster in parsed_chunk._.coref_clusters:
                for mention in cluster.mentions:
                    main_name = str(mention._.coref_cluster.main)
                    rows.append((chunk_id, cluster.i, mention.start, mention.text, main_name))

        return pd.DataFrame(rows, columns=['cids', 'rid', 'tid',
                                           'entity', 'main_name'])


class CharacterArchiver:
    """Join coreference and dependency annotations into character-word records.

    The constructor stores the three annotation DataFrames and loads a
    comma-separated anatomy lexicon from ``path_to_anatomy_lexicon``. The
    remaining methods clean the annotations, attach dependency rows to
    character mentions, and split the resulting words into anatomical and
    non-anatomical characterizations.
    """

    # Characters deleted from lemmas, mention text and names before matching.
    _PUNCT_TABLE = str.maketrans('', '', '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{}~’')

    # Cluster "names" that are really pronouns or parsing artefacts.
    _REMOVED_NAMES = ['“', '_', "I", "You", "you", "He", "he", "She", "she", "It", "it",
                      "They", "they", "Me", "me", "Him", "him", "Her", "her", "My",
                      "my", "His", "his", "Your", "your", "Yours", "yours", "Us", "us",
                      "Them", "them", "They", "they", "We", "we", "Theirs", "theirs",
                      "Their", "their", "Our", "our", "Ours", "ours",
                      "PARAGRAPH", "HE", "HER", 'All', 'all', 'this']

    _DEPENDENCY_COLS = ['cids', 'sids', 'tid', 'tid_target', 'relation',
                        'word', 'lemma', 'word_target', 'lemma_target']

    def __init__(self, coreference_df, dependency_df, entities_df, path_to_anatomy_lexicon):
        """Store the annotation frames and read the anatomy lexicon file.

        The lexicon file is split on commas; surrounding whitespace (including
        a trailing newline) is stripped from every entry and empty entries are
        skipped, so the last word of the file matches like any other.
        """
        self.coreference_df = coreference_df
        self.dependency_df = dependency_df
        self.entities_df = entities_df
        with open(path_to_anatomy_lexicon, 'rt', encoding="utf8") as file_in:
            raw_lexicon = file_in.read()
        self.anatomy_lexicon = [entry.strip() for entry in raw_lexicon.split(",") if entry.strip()]

    @classmethod
    def _strip_punct(cls, column):
        """Remove punctuation from each string of ``column``, then left-strip it."""
        # Element-wise translate works on empty columns and on values that
        # contain '|', both of which broke the former join/translate/split trick.
        return column.str.translate(cls._PUNCT_TABLE).str.lstrip()

    def cleaning_dependencies(self):
        """Return ``dependency_df`` with normalised text columns.

        Lemmas are stripped of punctuation and leading whitespace, all four
        text columns are lower-cased, and rows whose lemma or target lemma is
        at most one character long are dropped.
        """
        raw = self.dependency_df
        cleaned = raw.assign(word=raw['word'].str.lower(),
                             lemma=self._strip_punct(raw['lemma']).str.lower(),
                             word_target=raw['word_target'].str.lower(),
                             lemma_target=self._strip_punct(raw['lemma_target']).str.lower())
        long_enough = ((cleaned['lemma'].str.len() > 1) &
                       (cleaned['lemma_target'].str.len() > 1))
        return cleaned.loc[long_enough, self._DEPENDENCY_COLS]

    def cleaning_coreference(self):
        """Return ``coreference_df`` as cleaned (cids, rid, tid, pronoun, name) rows.

        ``name`` is the cleaned cluster main mention and ``pronoun`` the
        cleaned mention text. Rows whose name is at most one character long or
        is listed in ``_REMOVED_NAMES`` are dropped, and both text columns are
        lower-cased afterwards.
        """
        keep_cols = ['cids', 'rid', 'tid', 'pronoun', 'name']
        raw = self.coreference_df
        cleaned = raw.assign(name=self._strip_punct(raw['main_name']),
                             pronoun=self._strip_punct(raw['entity']))

        # Two-word mentions are anchored on their second token.
        # NOTE(review): presumably to skip a leading determiner/title -- confirm.
        two_words = cleaned['pronoun'].str.split().str.len() == 2
        cleaned = cleaned.assign(tid=np.where(two_words, cleaned['tid'] + 1, cleaned['tid']))

        usable = (cleaned['name'].str.len() > 1) & ~cleaned['name'].isin(self._REMOVED_NAMES)
        cleaned = cleaned[usable]
        cleaned = cleaned.assign(name=cleaned['name'].str.lower(),
                                 pronoun=cleaned['pronoun'].str.lower())
        return cleaned.reset_index(drop=True)[keep_cols]

    @staticmethod
    def merge_character_words(fixed_coreference_df, fixed_dependency_df):
        """Attach dependency rows to mentions sharing the same (cids, tid).

        Mentions without a matching dependency row are dropped (left merge
        followed by ``dropna``).
        """
        merged = pd.merge(fixed_coreference_df, fixed_dependency_df,
                          on=['cids', 'tid'],
                          how='left')
        return merged.dropna(axis='rows', how='any')

    def augment_character_words(self, fixed_coreference_df, fixed_dependency_df, character_words_df):
        """Add name tokens the coreference step missed, then keep PERSON names.

        Dependency rows whose lemma is both a known coreference name and a
        PERSON entity, but which have no coreference row at the same
        (cids, tid), are added with ``name``/``pronoun`` set to the lemma and
        ``rid`` set to 0. The combined rows are restricted to names listed in
        ``entities_df`` and de-duplicated on (cids, sids, tid, tid_target).
        """
        lemmas = fixed_dependency_df['lemma']
        is_candidate = (lemmas.isin(fixed_coreference_df['name']) &
                        lemmas.isin(self.entities_df['entity']))
        with_coref = fixed_dependency_df[is_candidate].merge(fixed_coreference_df,
                                                             how='left',
                                                             on=['cids', 'tid'])
        missed = with_coref[with_coref['name'].isnull()]
        # assign() builds a new frame instead of writing into a slice
        missed = missed.assign(pronoun=missed['pronoun'].fillna(missed['lemma']),
                               name=missed['name'].fillna(missed['lemma']),
                               rid=missed['rid'].fillna(0))

        # pd.concat replaces DataFrame.append, which pandas 2.0 removed
        combined = pd.concat([character_words_df, missed]) \
                     .sort_values(by=['cids', 'sids', 'tid']) \
                     .reset_index(drop=True)

        people = combined[combined['name'].isin(self.entities_df['entity'])]
        return people.drop_duplicates(subset=['cids', 'sids', 'tid', 'tid_target'])

    def _body_part_rows(self, body_df, fixed_dependency_df, relation, copula=None):
        """Find body-part tokens in ``relation`` and join their owner from ``body_df``.

        ``copula=True`` keeps only rows governed by the lemma "be",
        ``copula=False`` excludes them, ``None`` applies no such filter.
        """
        dep = fixed_dependency_df
        mask = (dep['cids'].isin(body_df['cids']) &
                dep['sids'].isin(body_df['sids']) &
                dep['tid'].isin(body_df['tid_target']) &
                dep['relation'].str.contains(relation) &
                dep['lemma'].isin(self.anatomy_lexicon))
        if copula is not None:
            # Exact match: a substring test would also treat "begin",
            # "become", "remember", ... as the copula.
            governed_by_be = dep['lemma_target'] == 'be'
            mask = mask & (governed_by_be if copula else ~governed_by_be)
        return pd.merge(dep[mask],
                        body_df,
                        how='left',
                        left_on=['cids', 'sids', 'tid', 'lemma'],
                        right_on=['cids', 'sids', 'tid_target', 'lemma_target'])

    @staticmethod
    def _label_body_rows(merged, label):
        """Reduce a ``_body_part_rows`` result to (name, relation, lemma) with a fixed relation."""
        return merged[['name', 'relation_x', 'lemma_target_x']] \
            .rename(columns={'relation_x': 'relation', 'lemma_target_x': 'lemma'}) \
            .assign(relation=label)

    def find_anatomical_characteristics(self, dedup_aug_characters_df, fixed_dependency_df):
        """Collect words attached to characters' body parts.

        ``body_df`` holds the character rows whose target lemma is in the
        anatomy lexicon. From the dependency rows this gathers verbs the body
        part is subject of ("agent"), verbs it is object of ("patient"), and
        adjectives modifying it or predicated of it through "be"
        ("predicative"), then adds the body-part words themselves.

        Returns:
            DataFrame with columns name, relation, lemma and
            ``characterization == 'anatomical'``, sorted by name.
        """
        dep = fixed_dependency_df
        in_lexicon = dedup_aug_characters_df['lemma_target'].isin(self.anatomy_lexicon)
        body_df = dedup_aug_characters_df[in_lexicon]

        # body part as subject of a non-copular verb / as direct object
        named_actions_df = self._label_body_rows(
            self._body_part_rows(body_df, dep, 'nsubj', copula=False), 'agent')
        named_patients_df = self._label_body_rows(
            self._body_part_rows(body_df, dep, 'dobj'), 'patient')

        # adjectives directly modifying a body part
        amod_mask = (dep['cids'].isin(body_df['cids']) &
                     dep['sids'].isin(body_df['sids']) &
                     dep['tid_target'].isin(body_df['tid_target']) &
                     dep['relation'].str.contains('amod') &
                     dep['lemma_target'].isin(self.anatomy_lexicon))
        owner_cols = ['cids', 'sids', 'rid', 'tid_target', 'lemma_target', 'pronoun', 'name']
        named_amod_df = pd.merge(dep[amod_mask],
                                 body_df[owner_cols],
                                 on=['cids', 'sids', 'tid_target', 'lemma_target'])[['name', 'relation', 'lemma']]

        # adjectives predicated of a body part: "<body part> be <acomp>"
        nsubj_be_df = self._body_part_rows(body_df, dep, 'nsubj', copula=True)
        acomp_mask = (dep['cids'].isin(nsubj_be_df['cids']) &
                      dep['sids'].isin(nsubj_be_df['sids']) &
                      dep['tid_target'].isin(nsubj_be_df['tid_target_x']) &
                      dep['relation'].str.contains('acomp'))
        be_owners = nsubj_be_df[['cids', 'tid_target_x', 'pronoun', 'name']] \
            .rename(columns={'tid_target_x': 'tid_target'})
        named_acomp_df = pd.merge(dep[acomp_mask],
                                  be_owners,
                                  how='left',
                                  on=['cids', 'tid_target'])[['name', 'relation', 'lemma']]

        named_predicatives_df = pd.concat([named_amod_df, named_acomp_df]) \
            .assign(relation='predicative')
        new_body_words = pd.concat([named_actions_df, named_patients_df, named_predicatives_df])

        # the body-part words already attached to characters, with the
        # dependency labels translated into the output vocabulary
        known_body_words = body_df[['name', 'relation', 'lemma_target']] \
            .rename(columns={'lemma_target': 'lemma'}) \
            .copy()
        relabel = {'poss': 'possessive', 'nsubj': 'agent', 'dobj': 'patient', 'amod': 'predicative'}
        for dep_label, out_label in relabel.items():
            matches = known_body_words['relation'].str.contains(dep_label)
            known_body_words.loc[matches, 'relation'] = out_label

        return pd.concat([new_body_words, known_body_words]) \
            .assign(characterization='anatomical') \
            .sort_values(by="name")

    def get_character_words(self, dedup_aug_characters_df, fixed_dependency_df):
        """Collect the non-anatomical words attached to characters.

        Works on the character rows whose target lemma is NOT in the anatomy
        lexicon and labels words as 'pred', 'agent', 'possessive' or
        'patient'.

        Returns:
            DataFrame with columns name, lemma, relation and
            ``characterization == 'not_anatomical'``, sorted by name.
        """
        in_lexicon = dedup_aug_characters_df['lemma_target'].isin(self.anatomy_lexicon)
        not_body_df = dedup_aug_characters_df[~in_lexicon]
        relation = not_body_df['relation']
        governed_by_be = not_body_df['lemma_target'] == 'be'

        # predicatives, part 1: "<character> be <acomp>" joined through the shared "be"
        nsubj_be = not_body_df.loc[(relation == 'nsubj') & governed_by_be]
        be_acomp = fixed_dependency_df.loc[(fixed_dependency_df['relation'] == 'acomp') &
                                           (fixed_dependency_df['lemma_target'] == 'be')]
        final_acomp = pd.merge(nsubj_be, be_acomp,
                               on=['cids', 'tid_target'],
                               how='left') \
            .rename(columns={"lemma_y": "lemma"})[['name', 'lemma']] \
            .dropna(axis='rows', how='any')

        # predicatives, part 2: amod rows (the head lemma is reported as the name)
        final_amod = not_body_df.loc[relation == 'amod', ['lemma', 'lemma_target']] \
            .rename(columns={"lemma_target": "name"})

        # NOTE(review): this label is 'pred' while find_anatomical_characteristics
        # emits 'predicative' -- confirm whether consumers rely on the difference.
        predicative_df = pd.concat([final_acomp, final_amod]).assign(relation='pred')

        def head_words(mask, label):
            # (name, head lemma) pairs of the selected rows under one relation label
            return not_body_df.loc[mask, ['name', 'lemma_target']] \
                .rename(columns={'lemma_target': 'lemma'}) \
                .assign(relation=label)

        agent_df = head_words((relation == 'nsubj') & ~governed_by_be, 'agent')
        poss_df = head_words(relation == 'poss', 'possessive')
        patient_df = head_words(relation == 'dobj', 'patient')

        return pd.concat([predicative_df, agent_df, poss_df, patient_df]) \
            .assign(characterization='not_anatomical') \
            .sort_values(by='name')

    @staticmethod
    def combine_all_character_words(body_final_df, character_final_df):
        """Stack the anatomical and non-anatomical word tables (indices kept)."""
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed
        return pd.concat([body_final_df, character_final_df])


class MajorCharacterFinder:
    """Filter character-word records down to frequently described characters.

    ``all_character_words_df`` must have a ``name`` column; ``threshold`` is
    the number of rows a name has to exceed to be kept.
    """

    def __init__(self, all_character_words_df, threshold):
        self.character_data = all_character_words_df
        self.threshold = threshold

    def count_character_words(self):
        """Return the rows of characters that occur more than ``threshold`` times.

        Returns:
            The subset of ``character_data`` (original order and index)
            whose ``name`` appears in strictly more than ``threshold`` rows.
        """
        # Work on the value_counts Series directly: its index always holds the
        # names, whereas the column labels produced by reset_index() differ
        # between pandas 1.x and 2.x.
        rows_per_name = self.character_data['name'].value_counts()
        major_names = rows_per_name[rows_per_name > self.threshold].index

        is_major = self.character_data['name'].isin(major_names)
        return self.character_data[is_major]

# Classes End


# Wrapper Functions


def gather_text(path_to_file):
    """Turn the text file at ``path_to_file`` into chunked prose sentences.

    Returns a tuple ``(chunks, file_id)``: the list of sentence chunks built
    from the file's prose, and the file's base name.
    """
    gatherer = TextGatherer(path_to_file)
    # dialog is dropped here; only narrative tokens are kept
    prose_tokens = gatherer.parse_prose()
    prose = gatherer.get_prose_string(prose_tokens)
    chunks = gatherer.chunk_sentences(gatherer.get_sentences(prose))
    return chunks, gatherer.file_id


def annotate_text(gathered_sentences, parser=None):
    """Annotate sentence chunks and return the three annotation DataFrames.

    Args:
        gathered_sentences: list of text chunks to parse.
        parser: optional, already configured pipeline (spaCy model with the
            neuralcoref pipe added). Pass one when processing many files so
            the model is loaded only once; when omitted, 'en_core_web_lg' is
            loaded and a NeuralCoref pipe is added on every call, as before.

    Returns:
        Tuple ``(coreference_df, dependency_df, entities_df)``.
    """
    if parser is None:
        # setup spacy parser
        parser = spacy.load('en_core_web_lg')
        coreferencer = neuralcoref.NeuralCoref(parser.vocab)
        parser.add_pipe(coreferencer, name='neuralcoref')

    # create annotation object and annotate texts
    annotated_text = Annotator(gathered_sentences, parser)
    spacy_annotations = annotated_text.spacy_init()

    # save annotations to dataframe structure
    coreference_df = annotated_text.make_coreferences(spacy_annotations)
    dependency_df = annotated_text.make_dependencies(spacy_annotations)
    entities_df = annotated_text.make_entities(spacy_annotations)
    return coreference_df, dependency_df, entities_df


def record_characters(coreference_df, dependency_df, entities_df, path_to_anatomy_lexicon):
    """Build the table of all words describing characters.

    Cleans the annotation frames, links character mentions to dependency
    rows, and returns the anatomical and non-anatomical character words
    combined into one DataFrame.
    """
    archiver = CharacterArchiver(coreference_df, dependency_df, entities_df, path_to_anatomy_lexicon)

    # 1) clean the raw annotations
    clean_deps = archiver.cleaning_dependencies()
    clean_corefs = archiver.cleaning_coreference()

    # 2) attach dependency rows to mentions, recover missed names, deduplicate
    mention_words = archiver.merge_character_words(clean_corefs, clean_deps)
    characters = archiver.augment_character_words(clean_corefs, clean_deps, mention_words)

    # 3) split into anatomical / other characterization, then recombine
    anatomical = archiver.find_anatomical_characteristics(characters, clean_deps)
    non_anatomical = archiver.get_character_words(characters, clean_deps)
    return archiver.combine_all_character_words(anatomical, non_anatomical)


def get_major_characters(all_character_words_df, threshold):
    """Keep only the rows of characters described by more than ``threshold`` words."""
    finder = MajorCharacterFinder(all_character_words_df, threshold)
    return finder.count_character_words()


def label_character_data(major_character_words_df, file_id):
    """Return a copy of the character data with a ``filename`` column set to ``file_id``."""
    labelled = major_character_words_df.copy()
    labelled['filename'] = file_id
    return labelled


def main(path_to_file, path_to_anatomy_lexicon, threshold):
    """Run the whole pipeline on one text file.

    Args:
        path_to_file: location of the text file to process.
        path_to_anatomy_lexicon: text file listing anatomical words, used to
            find additional words connected to characters' physiques.
        threshold: a character must be described by more words than this to
            be kept in the output.

    Returns:
        DataFrame of words describing the retained characters, labelled with
        the file's base name in a ``filename`` column.
    """
    # step 1: text file -> chunked prose sentences (+ file name)
    chunks, file_id = gather_text(path_to_file)

    # step 2: chunks -> coreference / dependency / entity dataframes
    print("Annotating text file {}".format(file_id))
    corefs, deps, entities = annotate_text(chunks)

    # step 3: annotations -> every word describing a character
    #         (anatomical and otherwise)
    all_words = record_characters(corefs, deps, entities, path_to_anatomy_lexicon)

    # step 4: drop characters with too few describing words
    major_words = get_major_characters(all_words, threshold)

    # step 5: tag the remaining rows with the file id
    labelled_words = label_character_data(major_words, file_id)
    print("Finished parsing text file {}".format(file_id))
    return labelled_words
